import urllib.request
from bs4 import BeautifulSoup
import re
import xlwt

def main():
    """Scrape the Douban Top 250 movie list and save it to ``top250.xls``.

    Requests ten result pages (start offsets 0, 25, ..., 225), extracts six
    fields per movie item with regular expressions, prints the collected
    rows, and writes them to an Excel workbook at the relative path
    ``top250.xls``.
    """

    # Patterns applied to the string form of each <div class="item"> element.
    # They are defined before the helpers that use them so the function reads
    # top to bottom.
    # Movie title spans.
    findtitle = re.compile(r'<span class="title">(.*)</span>')
    # Detail-page link.
    findLink = re.compile(r'<a href="(.*?)">')
    # Poster image URL.
    findimage = re.compile(r'<img.*src="(.*?)".*>', re.S)
    # One-line blurb (the "inq" span).
    findInq = re.compile(r'<span class="inq">(.*)</span>')
    # Attribute-less <span>; on the item markup this is the rating-count
    # text -- NOTE(review): confirm against the live page layout.
    findjudege = re.compile(r'<span>(\d*.*)</span>')

    def askURL(url):
        """Fetch *url* and return its body decoded as UTF-8.

        Returns an empty string when the request fails with
        ``urllib.error.URLError``; the failure is reported on stdout.
        """
        # Browser-like User-Agent header sent with the request.
        head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.30"}
        request = urllib.request.Request(url=url, headers=head)
        try:
            # The context manager closes the response even if read() or
            # decode() raises.
            with urllib.request.urlopen(request) as response:
                return response.read().decode("UTF-8")
        except urllib.error.URLError as e:
            # Report which URL failed and why instead of a bare "error".
            print("error", url, e)
            return ""

    def getData(baseurl):
        """Collect one row of fields for every movie item on the ten pages.

        Args:
            baseurl: URL prefix; the start offset is appended to it.

        Returns:
            A list of rows. Each row is a list of six strings:
            ``[link, image, title, alternate title, rating count, blurb]``.
            A field whose pattern does not match is stored as ``""``
            (``' '`` for a missing alternate title).
        """

        def first(pattern, text):
            """Return the first match of *pattern* in *text*, or ``""``."""
            matches = pattern.findall(text)
            return matches[0] if matches else ""

        datalist = []  # one entry per movie item
        for page in range(0, 10):
            html = askURL(baseurl + str(page * 25))
            # A failed fetch yields "", which parses to a document with no
            # items, so that page simply contributes no rows.
            soup = BeautifulSoup(html, "html.parser")
            for item in soup.find_all('div', class_="item"):
                item = str(item)  # the regexes work on the raw markup

                titles = findtitle.findall(item)
                ctitle = titles[0] if titles else ""
                if len(titles) == 2:
                    # The second title span carries a "/" separator.
                    otitle = titles[1].replace("/", "")
                else:
                    otitle = ' '

                datalist.append([
                    first(findLink, item),
                    first(findimage, item),
                    ctitle,
                    otitle,
                    first(findjudege, item),
                    first(findInq, item),
                ])
        print(datalist)
        return datalist

    def saveData(datalist, savePath):
        """Write *datalist* to an Excel workbook at *savePath*.

        Row 0 holds the column headers; every entry of *datalist* is written
        to the following rows, however many entries there are.
        """
        print("save...")
        book = xlwt.Workbook(encoding="utf-8", style_compression=0)
        sheet = book.add_sheet("top250", cell_overwrite_ok=True)
        col = ("电影链接","电影图片链接","电影片名","电影英文名","评价人数","概识")
        for j, header in enumerate(col):
            sheet.write(0, j, header)  # (row, column, value)
        # Iterate over the rows actually collected rather than assuming 250,
        # so a partially failed scrape still produces a file.
        for i, data in enumerate(datalist):
            print("第%d条"%i)
            for j, value in enumerate(data):
                sheet.write(i + 1, j, value)
        book.save(savePath)

    baseurl = "https://movie.douban.com/top250?start="
    datalist = getData(baseurl)
    savePath = "top250.xls"
    saveData(datalist, savePath)

# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()